library(data.table)
library(Rtsne)
library(R.utils)
library(umap)
library(ggplot2)
theme_set(theme_bw())
fnm1 = "GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz"
fnm2 = "GSE120575_Sade_Feldman_melanoma_single_cells_2.txt.gz"
sf_tpm1 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm1),
fill = TRUE, header = TRUE)
sf_tpm2 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm2),
fill = TRUE, drop = 16293, col.names = colnames(sf_tpm1))
ls_tpm = list(sf_tpm1,sf_tpm2)
sf_tpm = rbindlist(ls_tpm)
rm(ls_tpm)
rm(sf_tpm1)
rm(sf_tpm2)
dim(sf_tpm) # 55737 x 16292
## [1] 55737 16292
sf_tpm[1:2,1:5]
cell.names = colnames(sf_tpm)[-1]
gene.names = sf_tpm$V1
rownames(sf_tpm) = gene.names
sf_tpm$V1 = NULL
dim(sf_tpm)
## [1] 55737 16291
sf_tpm[1:2,1:5]
xnm = "1-s2.0-S0092867418313941-mmc2.xlsx"
table_s2 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm),
sheet = "Cluster annotation-Fig2A-B", col_names = TRUE)
dim(table_s2)
## [1] 6350 2
table_s2[1:2,]
table_s2 = as.data.frame(table_s2)
table(table_s2$`Cell Name` %in% cell.names)
##
## TRUE
## 6350
table(table_s2$Cluster)
##
## CD8_B CD8_G
## 3023 3327
CD8T = table_s2$`Cell Name`
sf_CD8T_tpm = sf_tpm[ , ..CD8T]
rownames(sf_CD8T_tpm) = gene.names
rm(sf_tpm)
gene.grouping = rep(1:28, each = 2000)[1:dim(sf_CD8T_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
temp.counts = apply(sf_CD8T_tpm[gene.grouping==i,], 1, FUN = function(x) sum(x>0))
cells.per.gene = c(cells.per.gene,temp.counts)
}
table(cells.per.gene == 0)
##
## FALSE TRUE
## 46790 8947
table(cells.per.gene == 1)
##
## FALSE TRUE
## 51118 4619
hist(log10(cells.per.gene), main = "CD8 T cells (CD8_B + CD8_G)",
xlab = "log10(number of cells) each gene expressed", breaks=50)
rm(gene.grouping)
plot the variance of the expression levels for each gene against its mean expression level
# kept 11662 genes
large.express.genes = gene.names[which(cells.per.gene > 200)]
large.expressed.tpm = sf_CD8T_tpm[match(large.express.genes,rownames(sf_CD8T_tpm)),]
dim(large.expressed.tpm)
## [1] 11662 6350
df_mv = data.frame(gene=large.express.genes, mean=rowMeans(large.expressed.tpm),
var=apply(large.expressed.tpm, 1, var))
rm(large.express.genes)
rm(large.expressed.tpm)
ggplot(data = df_mv) +
geom_point(aes(x=mean,y=var),size = 0.1)
divide the genes into 20 equal-size bins based on their mean expression levels, and select the first 50 genes with the highest variance from each bin.
df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
##
## [0.0214,0.157] (0.157,0.21] (0.21,0.252] (0.252,0.295] (0.295,0.344]
## 584 583 583 583 583
## (0.344,0.397] (0.397,0.457] (0.457,0.525] (0.525,0.602] (0.602,0.685]
## 583 583 583 583 583
## (0.685,0.792] (0.792,0.909] (0.909,1.04] (1.04,1.2] (1.2,1.41]
## 583 583 583 583 583
## (1.41,1.66] (1.66,2.04] (2.04,2.66] (2.66,3.91] (3.91,16.4]
## 583 583 583 583 584
hv.genes.list = by(df_mv, df_mv$grouping,
FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes = as.character(unlist(hv.genes.list))
rm(hv.genes.list)
hvg_tpm = sf_CD8T_tpm[match(hv.genes,rownames(sf_CD8T_tpm)),]
hvg_tpm = t(hvg_tpm)
colnames(hvg_tpm) = hv.genes
rm(df_mv)
hvg_tpm_log = log(hvg_tpm + 1)
pca.hvg = princomp(hvg_tpm_log, cor = TRUE)
eigvals = pca.hvg$sdev^2
pve = eigvals/sum(eigvals)
plot(1:50, pve[1:50], main = "scree plot (first 50 PCs)", type="b",
xlab="i-th PC",ylab="Proportion of Variance Explained")
rm(eigvals)
yvec = cumsum(pve)
plot(yvec, type="l", main = "plot of cumulative proportion of variance",
xlab = "i-th Principal Components",
ylab = "Cumulative variance", ylim=c(0,1))
abline(h=0.8, lty = 2)
plot(yvec[1:100], type="l", main = "plot of cumulative proportion of variance",
xlab = "i-th Principal Components",
ylab = "Cumulative variance")
pca.scores = pca.hvg$scores
rm(pca.hvg)
top50pcs = pca.scores[,1:50]
rm(pca.scores)
set.seed(9999)
tsne = Rtsne(top50pcs, pca=FALSE)
df_tsne = as.data.frame(tsne$Y)
names(df_tsne) = paste0("topPC_TSNE",seq(ncol(tsne$Y)))
rm(tsne)
pcs_umap = umap(top50pcs)
df_umap = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)
Prepare dataframe for k-means clustering using top 50 PCs
df_top50PCs = cbind(top50pcs, df_tsne)
df_top50PCs = cbind(df_top50PCs, df_umap)
rm(df_tsne)
rm(df_umap)
dim(df_top50PCs)
## [1] 6350 54
df_top50PCs[1:2, c(1:2,40:52)]
rm(top50pcs)
Try Kmeans for 2 to 8 clusters
set.seed(9999)
all_num_clust = c(2:8)
for (num_clust in all_num_clust) {
cat(paste0("KM with ",num_clust," clusters.\n"))
kmeans_out = kmeans(df_top50PCs[,1:50], centers = num_clust,
iter.max = 1e3, nstart = 50, algorithm = "MacQueen")
print(kmeans_out[c("betweenss","tot.withinss")])
km_label = paste0("KM_",num_clust)
df_top50PCs[[km_label]] = as.factor(kmeans_out$cluster)
pt1 = ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_TSNE1,y=topPC_TSNE2,color=eval(as.name(paste(km_label)))),
size = 0.3) +
scale_colour_discrete(name=paste(km_label)) +
guides(color = guide_legend(override.aes = list(size = 2)))
pt2 = ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=eval(as.name(paste(km_label)))),
size = 0.3) +
scale_colour_discrete(name=paste(km_label)) +
guides(color = guide_legend(override.aes = list(size = 2)))
print(pt1)
print(pt2)
rm(kmeans_out)
rm(pt1)
rm(pt2)
}
## KM with 2 clusters.
## $betweenss
## [1] 184844.6
##
## $tot.withinss
## [1] 831430.6
## KM with 3 clusters.
## $betweenss
## [1] 254609.9
##
## $tot.withinss
## [1] 761665.3
## KM with 4 clusters.
## $betweenss
## [1] 276162.9
##
## $tot.withinss
## [1] 740112.4
## KM with 5 clusters.
## $betweenss
## [1] 293896.1
##
## $tot.withinss
## [1] 722379.1
## KM with 6 clusters.
## $betweenss
## [1] 310972.3
##
## $tot.withinss
## [1] 705303
## KM with 7 clusters.
## $betweenss
## [1] 323238.4
##
## $tot.withinss
## [1] 693036.9
## KM with 8 clusters.
## $betweenss
## [1] 332961.8
##
## $tot.withinss
## [1] 683313.5
# CD8_G cluster and CD8_B cluster label for each CD8+ T cell
df_top50PCs$CD8T = as.factor(table_s2$Cluster)
# 6 CD8+ T cell clusters in Figure 4
xnm = "1-s2.0-S0092867418313941-mmc4.xlsx"
table_s4 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm),
sheet = "Cluster annotation-Fig4A-B", col_names = TRUE)
dim(table_s4)
## [1] 6350 2
table_s4[1:2,]
table(table_s4$Cluster)
##
## CD8_1 CD8_2 CD8_3 CD8_4 CD8_5 CD8_6
## 601 856 1744 980 808 1361
table(rownames(df_top50PCs) == table_s4$`Cell Name`)
##
## TRUE
## 6350
df_top50PCs$sf_cluster = as.factor(table_s4$Cluster)
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_TSNE1,y=topPC_TSNE2,color=CD8T),size = 0.3) +
guides(color = guide_legend(override.aes = list(size = 2)))
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=CD8T),size = 0.3) +
guides(color = guide_legend(override.aes = list(size = 2)))
In the K-means analysis result using 3 clusters, the cluster of CD8_B cells and that of CD8_D cells could be roughly identified. Cluster 1 contains all CD8_B cells, cluster 2 contains mostly CD8_B cells, and cluster 3 contains mostly CD8_G cells.
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=KM_3),size = 0.3) +
guides(color = guide_legend(override.aes = list(size = 2)))
tb1 = table(df_top50PCs$KM_3, df_top50PCs$CD8T)
tb1
##
## CD8_B CD8_G
## 1 672 3046
## 2 425 0
## 3 1926 281
CD8B_prop = tb1[,"CD8_B"]/rowSums(tb1)
CD8G_prop = tb1[,"CD8_G"]/rowSums(tb1)
CD8B_prop
## 1 2 3
## 0.1807423 1.0000000 0.8726778
CD8G_prop
## 1 2 3
## 0.8192577 0.0000000 0.1273222
CD8B_cluster = which(CD8B_prop > 0.8)
CD8G_cluster = which(CD8G_prop > 0.8)
table_s2$ruoyi_cluster = rep(NA, nrow(table_s2))
table_s2$ruoyi_cluster[df_top50PCs$KM_3 %in% CD8B_cluster & df_top50PCs$CD8T == "CD8_B"] = "CD8_B"
table_s2$ruoyi_cluster[df_top50PCs$KM_3 %in% CD8G_cluster & df_top50PCs$CD8T == "CD8_G"] = "CD8_G"
table(table_s2$ruoyi_cluster, table_s2$Cluster, useNA = "ifany")
##
## CD8_B CD8_G
## CD8_B 2351 0
## CD8_G 0 3046
## <NA> 672 281
write.csv(table_s2, "output/CD8_cluster.csv", quote = FALSE, row.names = FALSE)
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=sf_cluster),size = 0.3) +
guides(color = guide_legend(override.aes = list(size = 2)))
K-means analysis using 8 clusters have some overlap with the CD8+ T cells clusters defined by the paper.
ggplot(data=df_top50PCs) +
geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=KM_8),size = 0.3) +
guides(color = guide_legend(override.aes = list(size = 2)))
table(df_top50PCs$KM_8, df_top50PCs$sf_cluster)
##
## CD8_1 CD8_2 CD8_3 CD8_4 CD8_5 CD8_6
## 1 5 4 9 365 19 604
## 2 13 435 888 77 121 151
## 3 55 273 650 20 3 51
## 4 145 0 0 0 0 0
## 5 216 15 19 1 0 0
## 6 0 106 103 414 651 351
## 7 167 17 13 2 0 1
## 8 0 6 62 101 14 203
rm(df_top50PCs)
rm(sf_CD8T_tpm)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2099182 112.2 5444354 290.8 NA 5444354 290.8
## Vcells 108023654 824.2 1088285418 8303.0 32768 2125549550 16216.7
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.1 umap_0.2.7.0 R.utils_2.9.2 R.oo_1.23.0
## [5] R.methodsS3_1.8.0 Rtsne_0.15 data.table_1.12.8
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 RSpectra_0.16-0 cellranger_1.1.0 compiler_3.6.2
## [5] pillar_1.4.3 tools_3.6.2 digest_0.6.23 jsonlite_1.6.1
## [9] evaluate_0.14 lifecycle_0.2.0 tibble_3.0.1 gtable_0.3.0
## [13] lattice_0.20-38 pkgconfig_2.0.3 rlang_0.4.6 Matrix_1.2-18
## [17] yaml_2.2.1 xfun_0.12 withr_2.1.2 dplyr_0.8.4
## [21] stringr_1.4.0 knitr_1.28 vctrs_0.3.0 askpass_1.1
## [25] tidyselect_1.0.0 grid_3.6.2 reticulate_1.15 glue_1.3.1
## [29] R6_2.4.1 readxl_1.3.1 rmarkdown_2.1 farver_2.0.3
## [33] purrr_0.3.3 magrittr_1.5 scales_1.1.0 htmltools_0.4.0
## [37] ellipsis_0.3.0 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3
## [41] stringi_1.4.5 openssl_1.4.1 munsell_0.5.0 crayon_1.3.4